This is an extensive Exploratory Data Analysis for the Recruit Restaurant Visitor Forecasting competition with the powers of tidy R and ggplot2. We have also translated the main R code to Python.
The aim of this challenge is to predict the future numbers of restaurant visitors. This makes it a Time Series Forecasting problem. The data was collected from Japanese restaurants. As we will see, the data set is small and easily accessible without requiring much memory or computing power. Therefore, this competition is particularly suited for beginners.
The data comes in the shape of 8 relational files which are derived from two separate Japanese websites that collect user information: “Hot Pepper Gourmet (hpg): similar to Yelp” (search and reserve) and “AirREGI / Restaurant Board (air): similar to Square” (reservation control and cash register). The training data is based on the time range of Jan 2016 - most of Apr 2017, while the test set includes the last week of Apr plus May 2017. The test data intentionally spans a holiday week in Japan called the “Golden Week”. The data description further notes that: “There are days in the test set where the restaurants were closed and had no visitors. These are ignored in scoring. The training set omits days where the restaurants were closed.”
Those are the individual files:
air_visit_data.csv: historical visit data for the air restaurants. This is essentially the main training data set.
air_reserve.csv / hpg_reserve.csv: reservations made through the air / hpg systems.
air_store_info.csv / hpg_store_info.csv: details about the air / hpg restaurants including genre and location.
store_id_relation.csv: connects the air and hpg ids
date_info.csv: essentially flags the Japanese holidays.
sample_submission.csv: serves as the test set. The id is formed by combining the air id with the visit date.
Thank you all for the upvotes, critical feedback, and continued support!
# Set python environment and version in RStudio ;-)
# Force reticulate to use the system Python 3.10 interpreter (errors if absent).
reticulate::use_python("/usr/bin/python3.10", required = TRUE)
# Echo the resolved Python configuration (captured output follows inline).
reticulate::py_config()## python: /usr/bin/python3.10
## libpython: /usr/lib/python3.10/config-3.10-x86_64-linux-gnu/libpython3.10.so
## pythonhome: //usr://usr
## version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
## numpy: /home/kirus/.local/lib/python3.10/site-packages/numpy
## numpy_version: 1.26.0
##
## NOTE: Python version was forced by use_python() function
## 'en_US.utf8'
## [1] "/home/kirus/R/x86_64-pc-linux-gnu-library/4.3"
## [2] "/usr/local/lib/R/site-library"
## [3] "/usr/lib/R/site-library"
## [4] "/usr/lib/R/library"
## /usr/lib/R
# os.environ['R_HOME'] = "/home/kirus/R/x86_64-pc-linux-gnu-library/4.3"
# print(os.environ['R_HOME'])We need to set R libraries path when called by Python.
Two paths will be used:
/usr/lib/R/library for packages installed with root
profile during installing R base,
/home/kirus/R/x86_64-pc-linux-gnu-library/4.3 for
packages installed by user without root profile.
import rpy2.rinterface
#rpy2.rinterface.set_initoptions((b'rpy2', b'--no-save', b'--no-restore', b'--quiet'))
from rpy2.robjects.packages import importr## /home/kirus/.local/lib/python3.10/site-packages/rpy2/rinterface_lib/embedded.py:276: UserWarning: R was initialized outside of rpy2 (R_NilValue != NULL). Trying to use it nevertheless.
## warnings.warn(msg)
## R was initialized outside of rpy2 (R_NilValue != NULL). Trying to use it nevertheless.
We load a range of libraries for general data wrangling and general visualisation together with more specialised tools.
# general visualisation
library('ggplot2') # visualisation
library('scales') # visualisation
library('grid') # visualisation
library('gridExtra') # visualisation
library('RColorBrewer') # visualisation
library('corrplot') # visualisation
# general data manipulation
library('dplyr') # data manipulation
library('readr') # input/output
library('data.table') # data manipulation
library('DT') # display table
library('tibble') # data wrangling
library('tidyr') # data wrangling
library('stringr') # string manipulation
library('forcats') # factor manipulation
# specific visualisation
library('ggrepel') # visualisation
library('ggridges') # visualisation
library('ggExtra') # visualisation
library('ggforce') # visualisation
library('viridis') # visualisation
# specific data manipulation
library('lazyeval') # data wrangling
library('broom') # data wrangling
library('purrr') # string manipulation
# Date plus forecast
library('lubridate') # date and time
library('timeDate') # date and time
library('tseries') # time series analysis
library('forecast') # time series analysis
#library('prophet') # time series analysis
library('timetk') # time series analysis
# Maps / geospatial
library('geosphere') # geospatial locations
library('leaflet') # maps
library('leaflet.extras') # maps
library('maps') # maps
library(reticulate) # interface to python#pip3 install matplotlib
#pip3 install seaborn
#pip3 install scikit-learn
#pip3 install rpy2
# pip install git+https://github.com/h2oai/datatable
# pip install ipywidgets==7.0.1
from rpy2 import robjects
from rpy2.robjects.packages import importr, data
import rpy2.robjects.packages as rpackages
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import locale
from statsmodels.nonparametric.smoothers_lowess import lowess
from matplotlib.gridspec import GridSpec
import matplotlib.dates as mdates
from matplotlib.ticker import FormatStrFormatter
import folium
from folium.plugins import MarkerCluster
#from IPython.display import display, HTML # displays html table in Rmarkdown
#import ipywidgets
#import qgrid # display dataframe like DT::datatable in Rmarkdown document
#import datatable as dt
#from sklearn.model_selection import train_test_split
from collections import Counter
from datetime import datetime## Load iris from python
#iris = sns.load_dataset('iris')
# load iris from R
# Load R's built-in 'datasets' package via rpy2 and fetch the iris data frame.
datasets = rpackages.importr('datasets',
lib_loc= '/usr/lib/R/library/')
iris = data(datasets).fetch('iris')['iris']
# convert R dataframe to pandas df
# (https://rpy2.github.io/doc/latest/html/generated_rst/pandas.html)
# The converter context makes rpy2py() return a pandas DataFrame.
with (ro.default_converter + pandas2ri.converter).context():
iris = ro.conversion.get_conversion().rpy2py(iris)
#sns.set_style("darkgrid")
# Use a ggplot-like matplotlib theme to visually match the R figures.
plt.style.use('ggplot')
#fi = sns.FacetGrid(iris, hue ="Species",
# height = 6).map(plt.scatter,
# 'Petal.Length',
# 'Petal.Width').add_legend()
# Scatter plot of petal length vs. width, coloured by species.
fi=sns.relplot(x="Petal.Length",
y="Petal.Width",
data=iris, alpha=0.8,
hue='Species')
fi.fig.set_figheight(4)
fi.fig.set_figwidth(8)
plt.show()We use the multiplot function, courtesy of R Cookbooks to create multi-panel plots. We also make use of a brief helper function to compute binomial confidence intervals.
# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# Arrange several ggplot objects on one page using grid viewports
# (the classic R Cookbook multiplot recipe).
# NOTE(review): the 'file' argument is declared but never used — confirm intent.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
# Each cell of the matrix holds the index of the plot placed there.
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
# Single plot: no grid machinery needed.
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
# A plot index appearing in several cells spans all of them.
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}We use matplotlib to create subplots. The function multiplot now
works with *args to accept an arbitrary number of plots,
and the cols argument determines the number of columns in the layout. If
layout is not provided, it is calculated from cols and the number of
plots.
The example at the end demonstrates how to use the multiplot function with two plots (y1 and y2) arranged in two columns.
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
def multiplot(*args, plotlist=None, file=None, cols=1, layout=None):
    """Draw several plots on one figure, mimicking the R multiplot() helper.

    Parameters
    ----------
    *args : array-likes, each passed straight to ``Axes.plot``.
    plotlist : optional list of additional plots appended after ``args``.
    file : optional path; when given the figure is saved instead of shown.
    cols : number of grid columns used when ``layout`` is not supplied.
    layout : optional list of (row, col) grid positions, one per plot.
    """
    plots = list(args) + (plotlist if plotlist else [])
    num_plots = len(plots)
    if num_plots == 0:
        return  # nothing to draw
    if layout is None:
        # Fill the grid row by row; row count follows from cols.
        rows = (num_plots - 1) // cols + 1
        layout = [(i // cols, i % cols) for i in range(num_plots)]
    else:
        # BUGFIX: 'rows' was only assigned when layout was None, so passing
        # an explicit layout raised NameError. Derive it from the positions.
        rows = max(pos[0] for pos in layout) + 1
    fig = plt.figure()
    gs = GridSpec(rows, cols)
    for i, plot in enumerate(plots):
        ax = fig.add_subplot(gs[layout[i]])
        ax.plot(plot)
    if file:
        plt.savefig(file)
    else:
        plt.show()
# Example usage
#import numpy as np
#x = np.linspace(0, 10, 100)
#y1 = np.sin(x)
#y2 = np.cos(x)
#multiplot(y1, y2, cols=2)An other version using add_gridspec to create a layout
for multiple plots:
# Second variant of multiplot using a GridSpec filled row by row.
# NOTE(review): 'rows' is only assigned when layout is None, so passing an
# explicit 'layout' raises NameError; the 'layout' argument is also never
# consulted when placing plots — confirm whether it should be honoured.
def multiplot(*args, plotlist=None, file=None, cols=1, layout=None):
# Merge positional plots with the optional plotlist.
plots = list(args) + (plotlist if plotlist else [])
num_plots = len(plots)
if layout is None:
# Number of rows needed to fit all plots at 'cols' per row.
rows = (num_plots - 1) // cols + 1
fig = plt.figure()
if num_plots == 1:
# Single plot spans the whole row.
plt.subplot2grid((rows, cols), (0, 0), colspan=cols)
plt.plot(plots[0])
else:
# Place plot i at grid cell (i // cols, i % cols).
gs = GridSpec(rows, cols)
for i, plot in enumerate(plots):
ax = fig.add_subplot(gs[i // cols, i % cols])
ax.plot(plot)
# Save to disk when a path is given, otherwise display interactively.
if file:
plt.savefig(file)
else:
plt.show()air_visits <- data.table::fread(str_c("data/air_visit_data.csv"))
# Read the remaining raw CSV files with data.table::fread() for fast I/O.
# NOTE: the original wrapped each single string literal in str_c(), which is
# a no-op for one argument, so the wrapper has been dropped.
air_reserve <- data.table::fread('data/air_reserve.csv')        # air reservations
air_store <- data.table::fread('data/air_store_info.csv')       # air store metadata
holidays <- data.table::fread('data/date_info.csv')             # Japanese holiday flags
hpg_reserve <- data.table::fread('data/hpg_reserve.csv')        # hpg reservations
hpg_store <- data.table::fread('data/hpg_store_info.csv')       # hpg store metadata
store_ids <- data.table::fread('data/store_id_relation.csv')    # air <-> hpg id mapping
test <- data.table::fread(str_c('data/sample_submission.csv'))In this Python code, we use the pandas library to read the CSV files. Each read_csv function call reads a CSV file into a DataFrame. The variable names (air_visit, air_reserve, etc.) will be DataFrames that you can use for data manipulation and analysis.
# Pandas mirror of the R fread chunk: load every competition CSV into a DataFrame.
air_visits = pd.read_csv("data/air_visit_data.csv")
air_reserve = pd.read_csv("data/air_reserve.csv")
air_store = pd.read_csv("data/air_store_info.csv")
holidays = pd.read_csv("data/date_info.csv")
hpg_reserve = pd.read_csv("data/hpg_reserve.csv")
hpg_store = pd.read_csv("data/hpg_store_info.csv")
store_ids = pd.read_csv("data/store_id_relation.csv")
test = pd.read_csv("data/sample_submission.csv")As a first step let’s have an overview of the data sets.
We find that this file contains the visitors numbers for each visit_date and air_store_id. The date feature should be transformed into a time-series format. There are 829 different stores, which is a small data set:
## [1] 829
qgrid and ipywidgets versions are not compatible and have an issue
rendering an html table like DT::datatable().
#air_visits = pd.DataFrame(air_visits)
# Show the DataFrame using qgrid in Python
#qgrid_widget = qgrid.show_grid(air_visits, show_toolbar=True)
#qgrid_widget.head(10)
#html = air_visits.head(10).to_html()
#display(HTML(html))
#print(html)
air_visits.head(10)## air_store_id visit_date visitors
## 0 air_ba937bf13d40fb24 2016-01-13 25
## 1 air_ba937bf13d40fb24 2016-01-14 32
## 2 air_ba937bf13d40fb24 2016-01-15 29
## 3 air_ba937bf13d40fb24 2016-01-16 22
## 4 air_ba937bf13d40fb24 2016-01-18 6
## 5 air_ba937bf13d40fb24 2016-01-19 9
## 6 air_ba937bf13d40fb24 2016-01-20 31
## 7 air_ba937bf13d40fb24 2016-01-21 21
## 8 air_ba937bf13d40fb24 2016-01-22 18
## 9 air_ba937bf13d40fb24 2016-01-23 26
## 829
We find that the air reservations include the date and time of the reservation, as well as those of the visit. We have reservation numbers for 314 air stores:
## [1] 314
## air_store_id ... reserve_visitors
## 0 air_877f79706adbfb06 ... 1
## 1 air_db4b38ebe7a7ceff ... 3
## 2 air_db4b38ebe7a7ceff ... 6
## 3 air_877f79706adbfb06 ... 2
## 4 air_db80363d35f10926 ... 5
## 5 air_db80363d35f10926 ... 2
## 6 air_db80363d35f10926 ... 4
## 7 air_3bb99a1fe0583897 ... 2
## 8 air_3bb99a1fe0583897 ... 2
## 9 air_2b8b29ddfd35018e ... 2
##
## [10 rows x 4 columns]
## 314
The hpg reservations file follows the same structure as the corresponding air file. We have reservation numbers for 13325 hpg stores:
## [1] 13325
## hpg_store_id ... reserve_visitors
## 0 hpg_c63f6f42e088e50f ... 1
## 1 hpg_dac72789163a3f47 ... 3
## 2 hpg_c8e24dcf51ca1eb5 ... 2
## 3 hpg_24bb207e5fd49d4a ... 5
## 4 hpg_25291c542ebb3bc2 ... 13
## 5 hpg_28bdf7a336ec6a7b ... 2
## 6 hpg_2a01a042bca04ad9 ... 2
## 7 hpg_2a84dd9f4c140b82 ... 2
## 8 hpg_2ad179871696901f ... 2
## 9 hpg_2c1d989eedb0ff83 ... 6
##
## [10 rows x 4 columns]
## 13325
We find that the air_store info includes the name of the particular cuisine along with the name of the area.
## air_store_id air_genre_name ... latitude longitude
## 0 air_0f0cdeee6c9bf3d7 Italian/French ... 34.695124 135.197853
## 1 air_7cc17a324ae5c7dc Italian/French ... 34.695124 135.197853
## 2 air_fee8dcf4d619598e Italian/French ... 34.695124 135.197853
## 3 air_a17f0778617c76e2 Italian/French ... 34.695124 135.197853
## 4 air_83db5aff8f50478e Italian/French ... 35.658068 139.751599
## 5 air_99c3eae84130c1cb Italian/French ... 35.658068 139.751599
## 6 air_f183a514cb8ff4fa Italian/French ... 35.658068 139.751599
## 7 air_6b9fa44a9cf504a1 Italian/French ... 35.658068 139.751599
## 8 air_0919d54f0c9a24b8 Italian/French ... 35.658068 139.751599
## 9 air_2c6c79d597e48096 Italian/French ... 35.658068 139.751599
##
## [10 rows x 5 columns]
Again, the hpg_store info follows the same structure as the air info. Here the genre_name includes the word style. It’s worth checking whether the same is true for the air data or whether it just refers to the specific “Japanese style”. There are 4690 different hpg_store_ids, which are significantly fewer than we have reservation data for.
## hpg_store_id hpg_genre_name ... latitude longitude
## 0 hpg_6622b62385aec8bf Japanese style ... 35.643675 139.668221
## 1 hpg_e9e068dd49c5fa00 Japanese style ... 35.643675 139.668221
## 2 hpg_2976f7acb4b3a3bc Japanese style ... 35.643675 139.668221
## 3 hpg_e51a522e098f024c Japanese style ... 35.643675 139.668221
## 4 hpg_e3d0e1519894f275 Japanese style ... 35.643675 139.668221
## 5 hpg_530cd91db13b938e Japanese style ... 35.643675 139.668221
## 6 hpg_02457b318e186fa4 Japanese style ... 35.643675 139.668221
## 7 hpg_0cb3c2c490020a29 Japanese style ... 35.643675 139.668221
## 8 hpg_3efe9b08c887fe9a Japanese style ... 35.643675 139.668221
## 9 hpg_765e8d3ba261dc1c Japanese style ... 35.643675 139.668221
##
## [10 rows x 5 columns]
We called the date_info file holidays, because that’s essentially the information it contains. Holidays are encoded as binary flags in integer format. This should become a logical binary feature for exploration purposes.
## calendar_date day_of_week holiday_flg
## 0 2016-01-01 Friday 1
## 1 2016-01-02 Saturday 1
## 2 2016-01-03 Sunday 1
## 3 2016-01-04 Monday 0
## 4 2016-01-05 Tuesday 0
## 5 2016-01-06 Wednesday 0
## 6 2016-01-07 Thursday 0
## 7 2016-01-08 Friday 0
## 8 2016-01-09 Saturday 0
## 9 2016-01-10 Sunday 0
This is a relational file that connects the air and hpg ids. There are only 150 pairs, which is less than 20% of all air stores.
## air_store_id hpg_store_id
## 0 air_63b13c56b7201bd9 hpg_4bc649e72e2a239a
## 1 air_a24bf50c3e90d583 hpg_c34b496d0305a809
## 2 air_c7f78b4f3cba33ff hpg_cd8ae0d9bbd58ff9
## 3 air_947eb2cae4f3e8f2 hpg_de24ea49dc25d6b8
## 4 air_965b2e0cf4119003 hpg_653238a84804d8e7
## 5 air_a38f25e3399d1b25 hpg_50378da9ffb9b6cd
## 6 air_3c938075889fc059 hpg_349b1b92f98b175e
## 7 air_68301bcb11e2f389 hpg_2c09f3abb2220659
## 8 air_5f6fa1b897fe80d5 hpg_40aff6385800ebb1
## 9 air_00a91d42b08b08d9 hpg_fbe603376b5980fc
As noted in the data description, the id of the final submission file is a concatenation of the air_id and the date.
## id visitors
## 0 air_00a91d42b08b08d9_2017-04-23 0
## 1 air_00a91d42b08b08d9_2017-04-24 0
## 2 air_00a91d42b08b08d9_2017-04-25 0
## 3 air_00a91d42b08b08d9_2017-04-26 0
## 4 air_00a91d42b08b08d9_2017-04-27 0
## 5 air_00a91d42b08b08d9_2017-04-28 0
## 6 air_00a91d42b08b08d9_2017-04-29 0
## 7 air_00a91d42b08b08d9_2017-04-30 0
## 8 air_00a91d42b08b08d9_2017-05-01 0
## 9 air_00a91d42b08b08d9_2017-05-02 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
There are no missing values in our data. Excellent.
In Python, you can use the isna() method from the pandas library to check for missing values in a DataFrame, and then use sum() to count them. Here’s the equivalent code:
## 0
## 0
## 0
## 0
## 0
## 0
## 0
test.isna() returns a DataFrame of the same shape as
test with True where the original DataFrame has missing values, and
False otherwise.
.sum() is used twice. The first sum() computes the
count of missing values for each column, and the second sum() adds up
those counts to get the total number of missing values in the entire
DataFrame.
The final result, missing_values_count, will be the equivalent of sum(is.na(test)) in R.
We change the formatting of the date/time features and also reformat a few features to logical and factor variables for exploration purposes.
# Reformat raw columns for exploration: dates, datetimes, factors, logicals.
air_visits <- air_visits %>%
mutate(visit_date = ymd(visit_date))
# Parse reservation timestamps.
# NOTE(review): the "%Y-%m-%dT%H:%MZ" format assumes ISO-like strings with a
# literal 'T' and trailing 'Z' — verify against the actual CSV contents.
air_reserve <- air_reserve %>%
mutate(visit_datetime = as.POSIXct(visit_datetime,format="%Y-%m-%dT%H:%MZ"), #ymd_hms(visit_datetime),
reserve_datetime = as.POSIXct(reserve_datetime,format="%Y-%m-%dT%H:%MZ")) #ymd_hms(reserve_datetime)
hpg_reserve <- hpg_reserve %>%
mutate(visit_datetime = as.POSIXct(visit_datetime,format="%Y-%m-%dT%H:%MZ"),
reserve_datetime = as.POSIXct(reserve_datetime,format="%Y-%m-%dT%H:%MZ"))
# Genre and area names are low-cardinality labels -> factors.
air_store <- air_store %>%
mutate(air_genre_name = as.factor(air_genre_name),
air_area_name = as.factor(air_area_name))
hpg_store <- hpg_store %>%
mutate(hpg_genre_name = as.factor(hpg_genre_name),
hpg_area_name = as.factor(hpg_area_name))
# Holiday flag becomes logical; keep both a Date and a character date column.
holidays <- holidays %>%
mutate(holiday_flg = as.logical(holiday_flg),
date = ymd(calendar_date),
calendar_date = as.character(calendar_date))# Convert visit_date column to datetime format
# --- Reformat raw columns for analysis (pandas mirror of the R chunk) ---
# Parse every timestamp column into a proper datetime64 dtype.
air_visits['visit_date'] = pd.to_datetime(air_visits['visit_date'])
for frame in (air_reserve, hpg_reserve):
    for col in ('visit_datetime', 'reserve_datetime'):
        frame[col] = pd.to_datetime(frame[col])
# Genre and area names are low-cardinality labels, so store them as
# categoricals (the pandas analogue of R factors).
for frame, cols in ((air_store, ('air_genre_name', 'air_area_name')),
                    (hpg_store, ('hpg_genre_name', 'hpg_area_name'))):
    for col in cols:
        frame[col] = frame[col].astype('category')
# Holiday flag becomes a proper boolean; keep a datetime copy of the date.
holidays['holiday_flg'] = holidays['holiday_flg'].astype(bool)
holidays['date'] = pd.to_datetime(holidays['calendar_date'])
holidays['calendar_date'] = holidays['calendar_date'].astype(str)Here we have a first look at the distributions of the features in our individual data files before combining them for a more detailed analysis. This initial visualisation will be the foundation on which we build our analysis.
We start with the number of visits to the air restaurants. Here we plot the total number of visitors per day over the full training time range together with the median visitors per day of the week and month of the year:
# Force English locale so weekday/month labels render consistently.
Sys.setenv(LANG = "en_US.UTF-8")
# p1: total visitors per day across the whole training range.
p1 <- air_visits %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(visitors)) %>%
ggplot(aes(visit_date,all_visitors)) +
geom_line(col = "blue") +
labs(y = "All visitors", x = "Date")
# p2: distribution of visitors per store/day on a log10 x-scale;
# the orange line marks the ~20-visitor peak.
p2 <- air_visits %>%
ggplot(aes(visitors)) +
geom_vline(xintercept = 20, color = "orange") +
geom_histogram(fill = "blue", bins = 30) +
scale_x_log10()
# p3: median visitors by day of week (weeks start on Monday).
p3 <- air_visits %>%
mutate(wday = wday(visit_date, label = TRUE, week_start = 1)) %>%
group_by(wday) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(wday, visits, fill = wday)) +
geom_col() +
theme(legend.position = "none", axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
labs(x = "Day of the week", y = "Median visitors") +
scale_fill_hue()
# p4: median visitors by calendar month.
p4 <- air_visits %>%
mutate(month = month(visit_date, label = TRUE)) %>%
group_by(month) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(month, visits, fill = month)) +
geom_col() +
theme(legend.position = "none", axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
labs(x = "Month", y = "Median visitors")
# Layout: p1 spans the top row; p2-p4 share the bottom row (p4 double-width).
layout <- matrix(c(1,1,1,1,2,3,4,4),2,4,byrow=TRUE)
multiplot(p1, p2, p3, p4, layout=layout)We find: * There is an interesting long-term step structure in the overall time series. This might be related to new restaurants being added to the data base. In addition, we already see a periodic pattern that most likely corresponds to a weekly cycle.
The number of guests per visit per restaurant per day peaks at around 20 (the orange line). The distribution extends up to 100 and, in rare cases, beyond.
Friday and the weekend appear to be the most popular days; which
is to be expected. Monday and Tuesday have the
lowest numbers of average visitors.
Also during the year there is a certain amount of variation.
Dec appears to be the most popular month for restaurant
visits. The period of Mar - May is consistently
busy.
We will be forecasting for the last week of April plus May 2017, so let’s look at this time range in our 2016 training data:
# Zoom into mid-Apr to mid-Jun 2016 (the forecast window one year earlier)
# to inspect the weekly cycle and the Golden Week dip.
air_visits %>%
filter(visit_date > ymd("2016-04-15") & visit_date < ymd("2016-06-15")) %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(visitors)) %>%
ggplot(aes(visit_date,all_visitors)) +
geom_line() +
geom_smooth(method = "loess", color = "blue", span = 1/7) +
labs(y = "All visitors", x = "Date")## `geom_smooth()` using formula = 'y ~ x'
Here, the black line is the data and the blue line corresponds to a smoothing fit with a corresponding grey confidence area. We see again the weekly period and also the impact of the aforementioned Golden Week, which in 2016 happened between Apr 29 and May 5.
Seaborn doesn’t have a direct equivalent to the
geom_smooth function with the loess method. However, we can
achieve a similar effect using the seaborn.lineplot function combined
with a lowess smoother from the statsmodels
library.
## 'en_US.utf8'
# Set the language to English
#plt.rcParams['axes.formatter.use_locale'] = False
#plt.rcParams['pgf.rcfonts'] = False
#plt.rcParams['pdf.fonttype'] = 42
# Aggregate helpers for the summary panels (pandas mirror of the R plots).
p1_data = air_visits.groupby('visit_date')['visitors'].sum().reset_index()
p3_data = air_visits.assign(wday=air_visits['visit_date'].dt.day_name()).groupby('wday')['visitors'].median().reset_index()
p4_data = air_visits.assign(month=air_visits['visit_date'].dt.strftime('%b')).groupby('month')['visitors'].median().reset_index()
plt.style.use('ggplot')
fig = plt.figure(figsize=(11, 6))
gs = fig.add_gridspec(2, 3)
# First row: total visitors per day over the whole training range.
ax1 = fig.add_subplot(gs[0, :])
sns.lineplot(data=p1_data, x='visit_date', y='visitors', color='blue', ax=ax1)
# BUGFIX: the axis labels were swapped — x is the date, y the visitor count
# (matching the R original: labs(y = "All visitors", x = "Date")).
ax1.set_xlabel('Date')
ax1.set_ylabel('All visitors')
# Second row, left: histogram of visitors per store/day on a log x-scale;
# the orange line marks the ~20-visitor peak.
ax2 = fig.add_subplot(gs[1, 0])
plt.axvline(x=20, color='orange')
plt.xscale('log')
sns.histplot(data=air_visits, x='visitors', bins=30, color='blue', kde=True, ax=ax2)
# Second row, middle: median visitors by day of week (Mon-Sun order).
ax3 = fig.add_subplot(gs[1, 1])
order = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ax3.set_xticklabels( ('Mon', 'Tue','Wed', 'Thur', 'Fri', 'Sat', 'Sun') )
p3 = sns.barplot(data=p3_data, x='wday', y='visitors', hue='wday',
legend=False, ax=ax3, order=order) #palette='viridis',
p3 = plt.xticks(rotation=45)
#ax3.set_xticklabels(ax3.get_xticklabels(), rotation=45)
ax3.set_xlabel('Days')
# Second row, right: median visitors by calendar month (Jan-Dec order).
ax4 = fig.add_subplot(gs[1, 2])
order = [ "Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
p4 = sns.barplot(data=p4_data, x='month', y='visitors', hue='month',
legend=False, ax=ax4, order=order, palette='viridis')
p4 = plt.xticks(rotation=45)
#ax4.set_xticklabels(ax4.get_xticklabels(), rotation=45)
ax4.set_xlabel('Months')
plt.tight_layout()
plt.show()In this code, we use add_gridspec to create a 2x3 grid of subplots.
Then, we manually add subplots to specific positions using add_subplot.
This way, you have full control over the layout of the plots.
# Pandas/seaborn mirror of the R geom_smooth(loess) plot: raw daily totals
# plus a LOWESS-smoothed trend line for Apr 15 - Jun 15, 2016.
from statsmodels.nonparametric.smoothers_lowess import lowess
import matplotlib.dates as mdates
#locale.setlocale(locale.LC_ALL,'en_US.utf8')
plt.style.use('ggplot')
# Assuming air_visits is your DataFrame
# Restrict to the window that corresponds to the forecast period one year on.
subset_data = air_visits[(air_visits['visit_date'] > '2016-04-15') &
(air_visits['visit_date'] < '2016-06-15')]
# Calculate the total visitors for each visit_date
agg_data = subset_data.groupby('visit_date')['visitors'].sum().reset_index()
#agg_data.visit_date = agg_data.visit_date.dt.strftime('%d %b')
# build the figure
fig, ax = plt.subplots()
# Scatter plot of the data
p = sns.lineplot(data = agg_data, x='visit_date', y='visitors',
color='black', label='Data')
#p= sns.regplot(data=agg_data, x='visit_date', y='visitors',
# color='black', lowess=True, ax=plt.gca())
# set dates format
p = ax.set(xticklabels = agg_data['visit_date'].dt.strftime('%d %b'))## <string>:5: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
# NOTE(review): this grey band is a fixed +/-20% envelope around the data,
# not a computed LOWESS confidence interval — the label is cosmetic.
p = plt.fill_between(agg_data.visit_date, agg_data.visitors*0.8,
agg_data.visitors*1.2,
alpha=0.25, label='LOWESS uncertainty',
color = "grey")
# put the labels at 45deg since they tend to be too long
#p = plt.xticks(rotation=45)
fig.autofmt_xdate()
# Ensure that the x-axis tick labels are well spaced out.
p = ax.xaxis.set_major_locator(mdates.AutoDateLocator())
## Calculate the lowess smoothed line (frac=1/7 mirrors the R span).
smoothed = lowess(agg_data['visitors'],
agg_data['visit_date'].astype('datetime64[s]'), frac=1/7)
## Plot the smoothed line
p = plt.plot(agg_data['visit_date'], smoothed[:, 1],
color='blue', label='Lowess Smoother')
# Set labels and legend
p = plt.xlabel('Date')
p = plt.ylabel('All visitors')
plt.legend("")
plt.show(p)Let’s see how our reservations data compares to the actual visitor numbers. We start with the air restaurants and visualise their visitor volume through reservations for each day, alongside the hours of these visits and the time between making a reservation and visiting the restaurant:
# Derive date/hour/weekday features and reservation lead times from the
# air reservation timestamps.
foo <- air_reserve %>%
mutate(reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
reserve_wday = wday(reserve_datetime, label = TRUE, week_start = 1),
visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
visit_wday = wday(visit_datetime, label = TRUE, week_start = 1),
# Lead time between booking and visit, in hours and (fractional) days.
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")
)
# p1: reserved visitors per visit date.
p1 <- foo %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_date, all_visitors)) +
geom_line() +
labs(x = "Air visit date")
# p2: reserved visitors by hour of the visit.
p2 <- foo %>%
group_by(visit_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_hour, all_visitors)) +
geom_col(fill = "blue")
# p3: reserved visitors by lead time, restricted to under 5 days.
p3 <- foo %>%
filter(diff_hour < 24*5) %>%
group_by(diff_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(diff_hour, all_visitors)) +
geom_col(fill = "blue") +
labs(x = "Time from reservation to visit (hours)")
# p1 spans the top row; p2 and p3 share the bottom row.
layout <- matrix(c(1,1,2,3),2,2, byrow=TRUE)
multiplot(p1, p2, p3, layout=layout)# Assuming 'air_reserve' is your DataFrame
# Pandas mirror of the R air-reservation chunk: derive date/hour/weekday
# features and reservation lead times, then plot three panels.
foo = air_reserve.copy()
foo['reserve_date'] = foo['reserve_datetime'].dt.date
foo['reserve_hour'] = foo['reserve_datetime'].dt.hour
foo['reserve_wday'] = foo['reserve_datetime'].dt.day_name()
foo['visit_date'] = foo['visit_datetime'].dt.date
foo['visit_hour'] = foo['visit_datetime'].dt.hour
foo['visit_wday'] = foo['visit_datetime'].dt.day_name()
# Lead time in hours (fractional).
foo['diff_hour'] = (foo['visit_datetime'] - foo['reserve_datetime']).dt.total_seconds() / 3600
# NOTE(review): .dt.days truncates to whole days, whereas the R version
# (time_length(..., unit = "day")) keeps fractions — confirm intent.
foo['diff_day'] = (foo['visit_datetime'] - foo['reserve_datetime']).dt.days
# Aggregations for each panel.
p1 = foo.groupby('visit_date')['reserve_visitors'].sum().reset_index()
p2 = foo.groupby('visit_hour')['reserve_visitors'].sum().reset_index()
p3 = foo[foo['diff_hour'] < 24*5].groupby('diff_hour')['reserve_visitors'].sum().reset_index()
# Multiplot layout
fig = plt.figure(figsize=(9, 5))
gs = GridSpec(2, 2, figure=fig)
plt.style.use('ggplot')
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])
# Top row: reserved visitors per visit date.
p1.plot(x='visit_date', y='reserve_visitors', kind='line', ax=ax1, color = 'black')
p1= ax1.set_xlabel('air visit date')
p1= plt.legend("")
# Bottom left: reserved visitors by hour of visit.
p2.plot(x='visit_hour', y='reserve_visitors', kind='bar', color='blue', ax=ax2)
p2= ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
p2= ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
p2= plt.legend("")
# Bottom right: reserved visitors by lead time (< 5 days).
p3.plot(x='diff_hour', y='reserve_visitors', kind='bar', color='blue', ax=ax3)
p3= ax3.set_xlabel('Time from reservation to visit (hours)')
p3= ax3.set_xticklabels(ax3.get_xticklabels(), rotation=0)
p3= ax3.xaxis.set_major_locator(mdates.AutoDateLocator())
p3= ax3.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
p3= plt.legend("")
plt.tight_layout()
plt.show()In the same style as above, here are the hpg reservations:
# Same derivation as for the air reservations, applied to the hpg data.
foo <- hpg_reserve %>%
mutate(reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
# Lead time between booking and visit, in hours and (fractional) days.
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")
)
# p1: reserved visitors per visit date.
p1 <- foo %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_date, all_visitors)) +
geom_line() +
labs(x = "HPG visit date")
# p2: reserved visitors by hour of the visit.
p2 <- foo %>%
group_by(visit_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_hour, all_visitors)) +
geom_col(fill = "red")
# p3: reserved visitors by lead time, restricted to under 5 days.
p3 <- foo %>%
filter(diff_hour < 24*5) %>%
group_by(diff_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(diff_hour, all_visitors)) +
geom_col(fill = "red") +
labs(x = "Time from reservation to visit (hours)")
# p1 spans the top row; p2 and p3 share the bottom row.
layout <- matrix(c(1,1,2,3),2,2,byrow=TRUE)
multiplot(p1, p2, p3, layout=layout)We find:
Here the visits after reservation follow a more orderly pattern, with a clear spike in Dec 2016. As above for the air data, we also see reservation visits dropping off as we get closer to the end of the time frame.
Again, most reservations are for dinner, and we see another nice 24-hour pattern for making these reservations. It’s worth noting that here the last few hours before the visit don’t see more volume than the 24 or 48 hours before. This is in stark contrast to the air data.
# Pandas mirror of the R hpg-reservation chunk: derive date/hour features
# and reservation lead times, then plot three panels (hpg shown in red).
foo = hpg_reserve.copy()
foo['reserve_date'] = foo['reserve_datetime'].dt.date
foo['reserve_hour'] = foo['reserve_datetime'].dt.hour
foo['visit_date'] = foo['visit_datetime'].dt.date
foo['visit_hour'] = foo['visit_datetime'].dt.hour
# Lead time in hours (fractional).
foo['diff_hour'] = (foo['visit_datetime'] - foo['reserve_datetime']).dt.total_seconds() / 3600
# NOTE(review): .dt.days truncates to whole days, whereas the R version
# (time_length(..., unit = "day")) keeps fractions — confirm intent.
foo['diff_day'] = (foo['visit_datetime'] - foo['reserve_datetime']).dt.days
# Aggregations for each panel.
p1 = foo.groupby('visit_date')['reserve_visitors'].sum().reset_index()
p2 = foo.groupby('visit_hour')['reserve_visitors'].sum().reset_index()
p3 = foo[foo['diff_hour'] < 24*5].groupby('diff_hour')['reserve_visitors'].sum().reset_index()
# Multiplot layout
fig = plt.figure(figsize=(9, 5))
gs = GridSpec(2, 2, figure=fig)
plt.style.use('ggplot')
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])
# Top row: reserved visitors per visit date.
p1.plot(x='visit_date', y='reserve_visitors', kind='line', color='black', ax=ax1)
p1= ax1.set_xlabel('HPG visit date')
#p1= ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45)
p1= ax1.xaxis.set_major_locator(mdates.AutoDateLocator())
p1= plt.legend("")
# Bottom left: reserved visitors by hour of visit.
p2.plot(x='visit_hour', y='reserve_visitors', kind='bar', color='red', ax=ax2)
p2= ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
p2= ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
p2= plt.legend("")
# Bottom right: reserved visitors by lead time (< 5 days).
p3.plot(x='diff_hour', y='reserve_visitors', kind='bar', color='red', ax=ax3)
p3= ax3.set_xlabel('Time from reservation to visit (hours)')
p3= ax3.set_xticklabels(ax3.get_xticklabels(), rotation=0)
p3= ax3.xaxis.set_major_locator(mdates.AutoDateLocator())
p3= ax3.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
p3= plt.legend("")
plt.tight_layout()
plt.show()After visualising the temporal aspects, let’s now look at the spatial information. Note, that according to the data description the “latitude and longitude are the latitude and longitude of the area to which the store belongs”. This is meant to discourage us from identifying specific restaurants. I would be surprised if nobody tried anyway, though.
This is a fully interactive and zoomable map of all the air restaurants. Click on the clusters to break them up into smaller clusters and ultimately into the individual restaurants, which are labelled by their genre. The map nicely visualises the fact that many restaurants share common coordinates, since those coordinates refer to the area of the restaurant. Click on the single markers to see their air_store_id. The map is powered by the leaflet package, which includes a variety of cool tools for interactive maps. Have fun exploring!
# Interactive leaflet map of all air restaurants, with marker clustering.
# Clicking a marker pops up the air_store_id; hovering shows the genre.
leaflet(air_store) %>%
addTiles() %>%
addProviderTiles("CartoDB.Positron") %>%
addMarkers(~longitude, ~latitude,
popup = ~air_store_id, label = ~air_genre_name,
clusterOptions = markerClusterOptions())Next, we plot the numbers of different types of cuisine (or air_genre_names) alongside the areas with the most air restaurants:
# Horizontal bar chart of air restaurant counts per cuisine genre,
# ordered by count (reorder + coord_flip puts the largest genre on top).
p1 <- air_store %>%
group_by(air_genre_name) %>%
count() %>%
ggplot(aes(reorder(air_genre_name, n, FUN = min), n, fill = air_genre_name)) +
geom_col() +
coord_flip() +
theme(legend.position = "none") +
labs(x = "Type of cuisine (air_genre_name)", y = "Number of air restaurants")
# The 15 areas containing the most air restaurants.
p2 <- air_store %>%
group_by(air_area_name) %>%
count() %>%
ungroup() %>%
top_n(15,n) %>%
ggplot(aes(reorder(air_area_name, n, FUN = min) ,n, fill = air_area_name)) +
geom_col() +
theme(legend.position = "none") +
coord_flip() +
labs(x = "Top 15 areas (air_area_name)", y = "Number of air restaurants")
# Stack the two panels vertically.
layout <- matrix(c(1,2),2,1,byrow=TRUE)
multiplot(p1, p2, layout=layout)We find:
There are lots of Izakaya gastropubs in our data, followed by Cafe’s. We don’t have many Karaoke places in the air data set and also only a few that describe themselves as generically “International” or “Asian”. I have to admit, I’m kind of intrigued by “creative cuisine”.
Fukuoka has the largest number of air restaurants per area, followed by many Tokyo areas.
In Python, you can achieve a similar map using the folium library.
import warnings
warnings.filterwarnings('ignore')
# Create a map centered at a specific location
# (roughly the Tokyo Bay area; zoom level 7 shows most of Honshu).
m = folium.Map(location=[35.6528, 139.8395], zoom_start=7, width=1500, height=800)
# Add tiles (you can choose different providers)
folium.TileLayer(tiles='CartoDB positron').add_to(m)## <folium.raster_layers.TileLayer object at 0x7fb354605d50>
# Create a MarkerCluster object
marker_cluster = MarkerCluster().add_to(m)
## Add markers from the air_store dataset
# for index, row in air_store.iterrows():
# folium.Marker(location=[row['latitude'], row['longitude']],
# popup=row['air_store_id'],
# icon=None,
# tooltip=row['air_genre_name']).add_to(marker_cluster)
# Define a function to add markers
def add_marker(row):
    # Add one clustered marker per store: popup = store id, tooltip = genre.
    folium.Marker(location=[row['latitude'], row['longitude']],
                  popup=row['air_store_id'],
                  icon=None,
                  tooltip=row['air_genre_name']).add_to(marker_cluster)
# Apply the function to each row of the DataFrame
# (used purely for its side effect on marker_cluster; the echoed Series of
# None values below is apply()'s return value captured by the renderer).
air_store.apply(add_marker, axis=1)## 0 None
## 1 None
## 2 None
## 3 None
## 4 None
## ...
## 824 None
## 825 None
## 826 None
## 827 None
## 828 None
## Length: 829, dtype: object
# Count the air stores per cuisine genre, largest first.
genre_counts = air_store['air_genre_name'].value_counts().reset_index()
genre_counts.columns = ['air_genre_name', 'count']
p1_data = genre_counts.sort_values('count', ascending=False).reset_index(drop=True)
# Count the air stores per area and keep only the 15 busiest areas.
area_counts = air_store['air_area_name'].value_counts().nlargest(15).reset_index()
area_counts.columns = ['air_area_name', 'count']
p2_data = area_counts.sort_values('count', ascending=False).reset_index(drop=True)
# Multiplot layout
# Two stacked rows: genre counts on top, top-15 area counts below.
fig = plt.figure(figsize=(10, 5))
gs = GridSpec(2, 2, figure=fig)
plt.style.use('ggplot')
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, :])
# Plot 1
# order= keeps the bars in the (already sorted) p1_data row order.
p1= sns.barplot(data=p1_data, x='count', y='air_genre_name', order=p1_data['air_genre_name'],
                palette='dark', ax=ax1, legend=False)
p1= ax1.set_xlabel('Number of air restaurants')
p1= ax1.set_ylabel("Type of cuisine")
#p1= ax1.set_title("Number of air restaurants by cuisine type")
#p1= ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0)
#p1= ax1.yaxis.set_major_locator(mdates.AutoDateLocator())
#p1= plt.legend("")
# Plot 2
p2= sns.barplot(data=p2_data, x='count', y='air_area_name', order=p2_data['air_area_name'],
                palette='bright', ax=ax2, legend=False, width=0.8)
p2= ax2.set_xlabel("Number of air restaurants")
p2= ax2.set_ylabel("Top 15 areas")
#p2= ax2.set_title("Top 15 areas with most air restaurants")
#p2= ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
#p2= ax2.yaxis.set_major_locator(mdates.AutoDateLocator())
p2= plt.legend("")
plt.tight_layout()
plt.show()In the same way as for the air stores above, we also create an interactive map for the different hpg restaurants:
# Interactive leaflet map of the hpg restaurants, clustered like the air
# map above; popup = hpg_store_id, hover label = genre.
leaflet(hpg_store) %>%
addTiles() %>%
addProviderTiles("CartoDB.Positron") %>%
addMarkers(~longitude, ~latitude,
popup = ~hpg_store_id, label = ~hpg_genre_name,
clusterOptions = markerClusterOptions())Here is the breakdown of genre and area for the hpg restaurants
# Horizontal bar chart of hpg restaurant counts per genre, ordered by count.
p1 <- hpg_store %>%
group_by(hpg_genre_name) %>%
count() %>%
ggplot(aes(reorder(hpg_genre_name, n, FUN = min), n, fill = hpg_genre_name)) +
geom_col() +
coord_flip() +
theme(legend.position = "none") +
labs(x = "Type of cuisine (hpg_genre_name)", y = "Number of hpg restaurants")
# Top 15 areas; the long hpg area names are truncated to 20 characters
# (str_sub) so the axis labels stay readable.
p2 <- hpg_store %>%
mutate(area = str_sub(hpg_area_name, 1, 20)) %>%
group_by(area) %>%
count() %>%
ungroup() %>%
top_n(15,n) %>%
ggplot(aes(reorder(area, n, FUN = min) ,n, fill = area)) +
geom_col() +
theme(legend.position = "none") +
coord_flip() +
labs(x = "Top 15 areas (hpg_area_name)", y = "Number of hpg restaurants")
# Side-by-side layout: genres left, areas right.
layout <- matrix(c(1,2),1,2,byrow=TRUE)
multiplot(p1, p2, layout=layout)Here we have truncated the hpg_area_name to 20 characters to make the plot more readable.
We find:
The hpg description contains a larger variety of genres than in the air data. Here, “Japanese style” appears to contain many more places that are categorised more specifically in the air data. The same applies to “International cuisine”.
In the top 15 area we find again Tokyo and Osaka to be prominently present.
# Create a map centered at a specific location
# (same Tokyo-area center and zoom as the air map above).
m = folium.Map(location=[35.6528, 139.8395], zoom_start=7, width=1500, height=800)
# Add tiles (you can choose different providers)
folium.TileLayer(tiles='CartoDB positron').add_to(m)## <folium.raster_layers.TileLayer object at 0x7fb33df9a800>
# Create a MarkerCluster object
marker_cluster = MarkerCluster().add_to(m)
# Define a function to add markers
def add_marker(row):
    # One clustered marker per hpg store: popup = store id, tooltip = genre.
    folium.Marker(location=[row['latitude'], row['longitude']],
                  popup=row['hpg_store_id'],
                  icon=None,
                  tooltip=row['hpg_genre_name']).add_to(marker_cluster)
# Apply the function to each row of the DataFrame
# (side effect only; the echoed Series of None is apply()'s return value).
hpg_store.apply(add_marker, axis=1)## 0 None
## 1 None
## 2 None
## 3 None
## 4 None
## ...
## 4685 None
## 4686 None
## 4687 None
## 4688 None
## 4689 None
## Length: 4690, dtype: object
# Count the hpg stores per cuisine genre, largest first.
genre_counts = hpg_store['hpg_genre_name'].value_counts().reset_index()
genre_counts.columns = ['hpg_genre_name', 'count']
p1_data = genre_counts.sort_values('count', ascending=False).reset_index(drop=True)
# Count the hpg stores per area and keep only the 15 busiest areas.
area_counts = hpg_store['hpg_area_name'].value_counts().nlargest(15).reset_index()
area_counts.columns = ['hpg_area_name', 'count']
p2_data = area_counts.sort_values('count', ascending=False).reset_index(drop=True)
# Multiplot layout
# Two side-by-side columns: genre counts (left) and top-15 areas (right).
fig = plt.figure(figsize=(12, 8))
gs = GridSpec(2, 2, figure=fig)
plt.style.use('ggplot')
ax1 = fig.add_subplot(gs[:, 0])
ax2 = fig.add_subplot(gs[:, 1])
# Plot 1
# order= keeps the bars in the (already sorted) p1_data row order.
p1= sns.barplot(data=p1_data, x='count', y='hpg_genre_name', order=p1_data['hpg_genre_name'],
                palette='dark', ax=ax1, legend=False)
p1= ax1.set_xlabel('Number of hpg restaurants')
p1= ax1.set_ylabel("Type of cuisine")
#p1= ax1.set_title("Number of hpg restaurants by cuisine type")
#p1= ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0)
#p1= ax1.yaxis.set_major_locator(mdates.AutoDateLocator())
#p1= plt.legend("")
# Plot 2
p2= sns.barplot(data=p2_data, x='count', y='hpg_area_name', order=p2_data['hpg_area_name'],
                palette='bright', ax=ax2, legend=False, width=0.8)
p2= ax2.set_xlabel("Number of hpg restaurants")
p2= ax2.set_ylabel("Top 15 areas")
#p2= ax2.set_title("Top 10 areas with most hpg restaurants")
#p2= ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)
#p2= ax2.yaxis.set_major_locator(mdates.AutoDateLocator())
p2= plt.legend("")
plt.tight_layout()
plt.show()Let’s have a quick look at the holidays. We’ll plot how many there are in total and also how they are distributed during our prediction time range in 2017 and the corresponding time in 2016:
# Attach the ISO weekday (Monday = 1) to the holiday calendar.
foo <- holidays %>%
mutate(wday = wday(date, week_start = 1))
# Overall counts of holiday vs non-holiday days.
p1 <- foo %>%
ggplot(aes(holiday_flg, fill = holiday_flg)) +
geom_bar() +
theme(legend.position = "none")
# Holiday flags in the mid-April to end-of-May window of 2016
# (the same calendar window as the 2017 prediction range).
p2 <- foo %>%
filter(date > ymd("2016-04-15") & date < ymd("2016-06-01")) %>%
ggplot(aes(date, holiday_flg, color = holiday_flg)) +
geom_point(size = 2) +
theme(legend.position = "none") +
labs(x = "2016 date")
# Same window for 2017, which contains the Golden Week test period.
p3 <- foo %>%
filter(date > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
ggplot(aes(date, holiday_flg, color = holiday_flg)) +
geom_point(size = 2) +
theme(legend.position = "none") +
labs(x = "2017 date")
# Column-major layout: plot 1 fills the left column, 2 and 3 the right.
layout <- matrix(c(1,1,2,3),2,2,byrow=FALSE)
multiplot(p1, p2, p3, layout=layout)We find:
The same days were holidays in late Apr / May in 2016 as in 2017.
There are about 7% holidays in our data:
## frac
## 1 0.06769826
# Convert 'date' column to datetime if it's not already
holidays['date'] = pd.to_datetime(holidays['date'])
# Plot 1: counts of holiday vs non-holiday days.
p1_data = holidays['holiday_flg'].value_counts().reset_index()
p1_data.columns = ['holiday_flg', 'count']
# Plots 2/3: the late-April-to-May windows of 2016 and 2017.
# BUG FIX: take explicit copies of the boolean slices. The original assigned
# a new column onto a slice of `holidays`, which raises pandas'
# SettingWithCopyWarning and is not guaranteed to modify p2_data at all.
p2_data = holidays[(holidays['date'] > '2016-04-15') & (holidays['date'] < '2016-06-01')].copy()
p2_data['holiday_flg'] = p2_data['holiday_flg'].astype('category')
p3_data = holidays[(holidays['date'] > '2017-04-15') & (holidays['date'] < '2017-06-01')].copy()
# Multiplot layout
# Left column: overall holiday-flag counts; right column: 2016 (top) and
# 2017 (bottom) windows as date-vs-flag scatter plots.
fig = plt.figure(figsize=(6, 4))
gs = GridSpec(2, 2, figure=fig)
plt.style.use('ggplot')
# Force English month names on the date axes regardless of system locale.
locale.setlocale(locale.LC_ALL,'en_US.utf8')## 'en_US.utf8'
ax1 = fig.add_subplot(gs[:, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[1, 1])
# Plot1
p1= sns.barplot(data=p1_data, x='holiday_flg', y='count', palette='viridis',
                ax=ax1, legend=False)
p1= ax1.set_xlabel('Holiday Flag')
#p1= ax1.set_title("Distribution of Holiday Flag")
p1= plt.legend("")
# Plot 2
p2= sns.scatterplot(data=p2_data, x='date', y='holiday_flg', hue='holiday_flg',
                    palette='viridis', ax=ax2, legend=False)
# Relabel the 0/1 flag ticks as False/True.
p2.set_yticks([1.0, 0.0], ["True","False"])
p2= ax2.set_xlabel('2016')
p2= ax2.set_title("2016")
#p2 = ax2.set(xticklabels = p2_data['date'].dt.strftime('%d %b'))
p2= fig.autofmt_xdate()
p2= ax2.set_ylabel('')
p2= plt.legend("")
p2= ax2.xaxis.set_major_locator(mdates.AutoDateLocator())
# Plot 3
p3= sns.scatterplot(data=p3_data, x='date', y='holiday_flg', hue='holiday_flg', palette='viridis', ax=ax3, legend=False)
p3.set_yticks([1.0, 0.0], ["True","False"])
p3 = ax3.set(xticklabels = p3_data['date'].dt.strftime('%d %b'))
p3= fig.autofmt_xdate()
p3= ax3.set_ylabel('')
p3= ax3.xaxis.set_major_locator(mdates.AutoDateLocator())
p3= plt.xlabel("2017")
#p3= plt.title("Holiday Flag in 2017")
p3= plt.legend("")
plt.tight_layout( pad=0.4, w_pad=0.5, h_pad=1)
plt.show()## 0.068
The following plot visualises the time range of the train vs test data sets:
# Distinct dates present in the training data, tagged "train".
foo <- air_visits %>%
rename(date = visit_date) %>%
distinct(date) %>%
mutate(dset = "train")
# The test-set dates are encoded in the submission id as
# "air_<store>_<date>"; split on "_" to recover the date part.
bar <- test %>%
separate(id, c("foo", "bar", "date"), sep = "_") %>%
mutate(date = ymd(date)) %>%
distinct(date) %>%
mutate(dset = "test")
foo <- foo %>%
bind_rows(bar) %>%
mutate(year = year(date))
# Project every date onto 2017 so both years share one x-axis;
# the original year is kept in the 'year' column for the y-axis.
year(foo$date) <- 2017
foo %>%
filter(!is.na(date)) %>%
mutate(year = fct_relevel(as.factor(year), c("2017","2016"))) %>%
ggplot(aes(date, year, color = dset)) +
geom_point(shape = "|", size = 10) +
scale_x_date(date_labels = "%B", date_breaks = "1 month") +
#scale_y_reverse() +
theme(legend.position = "bottom", axis.text.x = element_text(angle=45, hjust=1, vjust=0.9)) +
labs(color = "Data set") +
guides(color = guide_legend(override.aes = list(size = 4, pch = 15)))# Renaming and selecting distinct dates for 'foo'
# Distinct training dates, tagged "train".
foo = air_visits.rename(columns={'visit_date': 'date'}).drop_duplicates(subset=['date'])
foo['dset'] = 'train'
# Processing 'bar'
# NOTE(review): this mutates the global `test` DataFrame in place by adding
# a 'date' column parsed from the last "_"-separated field of the id.
test['date'] = pd.to_datetime(test['id'].str.split('_').str[-1])
bar = test[['date']].drop_duplicates()
bar['dset'] = 'test'
# Combining 'foo' and 'bar'
foo = pd.concat([foo, bar], ignore_index=True)
# Setting 'year' column
foo['year'] = foo['date'].dt.strftime('%Y').astype('category')
# Filtering out NA dates
foo = foo.dropna(subset=['date'])
# Adding a 'Months' column
# NOTE(review): unlike the R version above (which rebases all dates to 2017
# via `year(foo$date) <- 2017` before plotting), the month labels here come
# from the original dates — confirm both year rows still align as intended.
foo['Months'] = foo['date'].dt.strftime('%B')
# Set up the plot
# Train vs test date coverage: one row per year, one star per month/dset.
plt.figure(figsize=(8, 4))
#p= sns.set(rc={'figure.figsize':(4,2)})
plt.style.use('ggplot')
# Create a scatter plot using Seaborn
p= sns.scatterplot(data=foo, x='Months', y= 'year' , hue='dset', color= 'dset',
                   #palette={'train': 'blue', 'test': 'red'},
                   marker='*',
                   s=250)
# Format the tick labels to display the month name
#p.invert_yaxis()
p= plt.xticks(rotation=35)
p= plt.tick_params(axis='x', pad=-5, labelsize=8)
plt.margins(y=1)
# Pin the legend to the lower-right corner inside the axes.
plt.legend(bbox_to_anchor=(0.99, 0.01), loc='lower right', borderaxespad=0)
plt.show(p)After looking at every data set individually, let’s get to the real fun and start combining them. This will tell us something about the relations between the various features and how these relations might affect the visitor numbers. Any signal we find will need to be interpreted in the context of the individual feature distributions, which is why it was one of our first steps to study those.
Our first plot of the multi-feature space deals with the average number of air restaurant visitors broken down by type of cuisine; i.e. the air_genre_name. We use a facet plot to distinguish the time series for the 14 categories. Note the logarithmic y-axis:
# Attach store metadata (genre, area, coordinates) to the daily visits.
foo <- air_visits %>%
left_join(air_store, by = "air_store_id")
# Mean daily visitors per genre, one facet per genre, log y-axis.
foo %>%
group_by(visit_date, air_genre_name) %>%
summarise(mean_visitors = mean(visitors)) %>%
ungroup() %>%
ggplot(aes(visit_date, mean_visitors, color = air_genre_name)) +
geom_line() +
labs(y = "Average number of visitors to 'air' restaurants", x = "Date") +
theme(legend.position = "none") +
scale_y_log10() +
facet_wrap(~ air_genre_name)## `summarise()` has grouped output by 'visit_date'. You can override using the
## `.groups` argument.
We find:
The mean values range between 10 and 100 visitors per genre per day. Within each category, the long-term trend looks reasonably stable. There is an upward trend for “Creative Cuisine” and “Okonomiyaki” et al., while the popularity of “Asian” food has been declining since late 2016.
The low-count time series like “Karaoke” or “Asian” (see Fig. 6) are understandably more noisy than the genres with higher numbers of visitors. Still, “Asian” restaurants appear to be very popular despite (or because of?) their rarity.
In all of the curves we see the familiar weekly variation, so let’s look in more detail at the mean visitor numbers per week day and genre. We also add to this a series of ridgeline plots via the ggridges package. Ridgeline plots allow for a quick comparison of semi-overlapping (density) curves. Here we show the distribution of visitors per day for each genre. Through a little bit of ggplot magic, the y-axis labels in between those two plots refer to both of them:
# Mean visitors per weekday and genre; one dot per weekday, coloured by it.
# The y labels are blanked here and drawn on the right (position = "top"
# after coord_flip) so they sit between this plot and the ridgelines.
p1 <- foo %>%
mutate(wday = wday(visit_date, label = TRUE, week_start = 1)) %>%
group_by(wday, air_genre_name) %>%
summarise(mean_visitors = mean(visitors)) %>%
ggplot(aes(air_genre_name, mean_visitors, color = wday)) +
geom_point(size = 4) +
theme(legend.position = "left", axis.text.y = element_blank(),
plot.title = element_text(size = 14)) +
coord_flip() +
labs(x = "") +
scale_x_discrete(position = "top") +
ggtitle("air_genre_name") +
scale_color_hue()## `summarise()` has grouped output by 'wday'. You can override using the
## `.groups` argument.
# Ridgeline densities of daily visitor counts per genre on a log x-axis,
# with alternating blue/red fills (scale_fill_cyclical from ggridges).
p2 <- foo %>%
ggplot(aes(visitors, air_genre_name, fill = air_genre_name)) +
geom_density_ridges(bandwidth = 0.1) +
scale_x_log10() +
theme(legend.position = "none") +
labs(y = "") +
scale_fill_cyclical(values = c("blue", "red"))
# p1 takes 2/5 of the width, p2 the remaining 3/5.
layout <- matrix(c(1,1,2,2,2),1,5,byrow=TRUE)
multiplot(p1, p2, layout=layout)Here each colour corresponds to a day of the week. Red-ish coulours are the weekend, while the cooler colours are the middle of the week. Monday is a dark orange.
We find:
The biggest difference between weekend and weekdays exists for the “Karaoke” bars, which rule the weekend. A similar trend, although with a considerably smaller gap, can be seen for the “International” cuisine.
No genre really goes against the trend of busier weekends. The smallest variations are in the generic “Other” category, the “Japanese” food, and also the “Korean” cuisine which is the only category where Fridays are the busiest days. General “Bars/Cocktail” are notably unpopular overall.
The density curves confirm the impression we got from the week-day distribution: the “Asian” restaurants have rarely less than 10 visitors per date and the “Karaoke” places show a very broad distribution due to the strong impact of the weekends. Note the logarithmic x-axis.
# Merge 'air_visits' and 'air_store' on the column 'air_store_id'
foo = pd.merge(air_visits, air_store, on='air_store_id', how='left')
#Grouping and summarizing data: mean visitors per visit date and genre.
summary_data = foo.groupby(['visit_date', 'air_genre_name'],
                           observed=True)['visitors'].mean().reset_index()
fig = plt.figure(figsize = (4,2))
plt.style.use('ggplot')
# Set up the FacetGrid: one panel per genre, four panels per row.
g = sns.FacetGrid(summary_data, col="air_genre_name",
                  col_wrap=4, height=2, aspect= 1,
                  sharex=True)
# Create line plots for each genre
p = g.map_dataframe(sns.lineplot, x='visit_date',
                    y='visitors', hue='air_genre_name')
# Format the x-axis date labels
#g.set(xticks=pd.to_datetime(['2016-01', '2016-03', '2016-05', '2016-07', '2016-09', '2016-11', '2017-01']))
#fig.autofmt_xdate()
# Resize facet titles
p= g.set_titles(col_template="{col_name}", size = 8, backgroundcolor='#DDDDDD', pad=2
                # bbox={'facecolor': '#DDDDDD', 'alpha': 0, 'pad': 0}
                )
p= g.tick_params(axis='x', pad=0, labelsize= 6)
p= g.set_xlabels(fontsize=6, label="Date")
p= g.set_ylabels(fontsize=8 )#,label="Average number of visitors")
# Set labels and title
#g.set_axis_labels(x_var="Date",y_var= "Average number of visitors")
#g.add_legend(title="gendre")
# Rotate every facet's x tick labels for readability.
for ax in g.axes.flat:
    for label in ax.get_xticklabels():
        label.set_rotation(45)
# Apply logarithmic scale to y-axis
p= g.set(yscale="log")
# Adjust the layout
p= g.tight_layout()
# Show the plot
plt.show(p)# Extracting weekday
# Weekday name of each visit (e.g. "Monday"), used to colour plot 1.
foo['wday'] = foo['visit_date'].dt.day_name()
# Data Processing for p1: mean visitors per weekday and genre.
p1_data = foo.groupby(['wday', 'air_genre_name'], observed=True).agg(mean_visitors=('visitors', 'mean')).reset_index()
# Data Processing for p2: per-genre visitor distributions.
p2_data = foo.copy()
# Create a 2x2 grid; each plot spans a full column.
fig = plt.figure(figsize=(12, 8))
gs = GridSpec(2, 2, figure=fig)
# Plot 1 (Left): mean visitors by genre, one dot per weekday.
ax1 = fig.add_subplot(gs[:, 0])
sns.scatterplot(data=p1_data,
                y='air_genre_name',
                x='mean_visitors',
                hue='wday',
                size=4,
                #palette='viridis',
                ax=ax1)
ax1.set_title('air_genre_name', fontsize=8)
ax1.set_xlabel('')
ax1.set_ylabel('')
ax1.legend(title='wday', loc='lower right')
#ax1.legend("")
# Plot 2 (Right): visitor-count density per genre.
# BUG FIX: the original first called fig.add_subplot(gs[:, 1], sharex=True),
# which raised "'other' must be an instance of ..._AxesBase, not a bool"
# (sharex expects an Axes object), and then re-created the axes. Create the
# axes once, correctly.
ax2 = fig.add_subplot(gs[:, 1])
# BUG FIX: the original looped over the genres but passed the *entire*
# DataFrame with hue='air_genre_name' on every iteration, redrawing all of
# the densities once per genre. A single hued call draws the same picture
# exactly once.
sns.kdeplot(data=p2_data,
            x='visitors',
            hue='air_genre_name',
            fill=True,
            common_norm=False,
            ax=ax2)
ax2.set_xlabel('')
ax2.set_ylabel('')
ax2.set_title('')
#ax2.legend("",loc=2)
p= ax2.set_xlim(-50, 200)
# Adjust layout
p= plt.tight_layout()
# Show the plots
plt.show(p)# Data Processing for p2 (Assuming 'foo' DataFrame is available)
# Ridgeline-style plot: one very wide, short facet row per genre.
p2_data = foo.copy()
#fig = plt.figure(figsize=(4, 8))
# Plot 2 (Bottom)
#pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(p2_data,
                  row="air_genre_name",
                  hue="air_genre_name",# palette=pal,
                  aspect=15, height=.5
                  )
# Filled density per facet, plus a white outline drawn on top of it.
p= g.map(sns.kdeplot, 'visitors',
         bw_adjust=.5, clip_on=False,
         fill=True, alpha=1, linewidth=.5)
p= g.map(sns.kdeplot, 'visitors', clip_on=False, color="w", lw=2, bw_adjust=.5)
# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    # Write the facet's genre name inside its own axes, in the facet colour.
    ax = plt.gca()
    ax.text(0, .2, label, color=color, #fontweight="bold",
            ha="right", va="center", transform=ax.transAxes)
p= g.map(label, "visitors")
p= g.set(xlim=(-5, 80))
p= g.set_titles("")
p= g.set(yticks=[], ylabel="")
p= g.despine(bottom=True, left=True)
# Negative hspace makes adjacent facets overlap — the ridgeline effect.
p= g.fig.subplots_adjust(hspace=-.25)
# Show the plots
plt.show(p)